This Rmarkdown file assesses the output of CheckV, DeepVirFinder, Kaiju, VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, primarily from NCBI. Created from fungal, viral, bacterial, archeael, protist, and plasmid DNA sequences
Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.
library(ggplot2)
There were 50 or more warnings (use warnings() to see the first 50)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
library(pROC)
library("stringr")
Import the file that combines the results from each of the tools from running “combining_tool_output.Rmd”:
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
── Column specification ───────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
seqtype = col_character(),
contig = col_character(),
checkv_provirus = col_character(),
checkv_quality = col_character(),
method.x = col_character(),
Classified = col_character(),
IDs_all = col_character(),
Seq = col_character(),
Kaiju_Viral = col_character(),
Kingdom = col_character(),
type = col_character(),
vibrant_quality = col_character(),
method.y = col_character(),
vibrant_prophage = col_character(),
vs2type = col_character(),
max_score_group = col_character(),
provirus = col_logical()
)
ℹ Use `spec()` for the full column specifications.
There were 50 or more warnings (use warnings() to see the first 50)
This section defines a viralness score “keep_score” based on the tool classifications. A final keep_score above 1 indicates we will keep that sequence and call it viral.
VIBRANT Quality == “Complete Circular”: +1 Quality == “High Quality Draft”: +1 Quality == “Medium Quality Draft”: +1 Quality == “Low Quality Draft” & provirus: +0.5
Virsorter2 Viral >= 50: +0.5 Viral >= 0.95: +0.5 RNA >= 0.9: +1 lavidaviridae >= 0.9: +1 NCLDV >= 0.9: +1
Virsorter category == 1,4: +1 category == 2,5: +0.5
DeepVirFinder: Score >= 0.7: +0.5
Tuning - No Viral Signature: Kaiju_viral = “cellular organisms”: -0.5 If host_genes >50 and NOT provirus: -1 If viral_genes == 0 and host_genes >= 1: -1 If 3*viral_genes <= host_genes and NOT provirus: -1 If length > 50,000 and hallmark <=1: -1 If length < 5000 and checkv completeness <= 75: -0.5
Tuning - Viral Signature: Kaiju_viral = “Viruses”: +0.5 If %unknown >= 75 and length < 50000: +0.5 If %viral >= 50: +0.5 Hallmark > 2: +0.5
This script produces visualizations of these combined viral scorings and includes ecological metrics like alpha diversity.
You can decide which combination is appropriate for them and only need use the tools appropriate for your data.
getting_viral_set_1 <- function(input_seqs,
include_vibrant=FALSE,
include_virsorter2=FALSE,
include_deepvirfinder=FALSE,
include_tuning_viral=FALSE,
include_tuning_not_viral=FALSE,
include_virsorter=FALSE) {
keep_score <- rep(0, nrow(input_seqs))
if (include_vibrant) {
keep_score[input_seqs$vibrant_quality=="complete circular"] <- keep_score[input_seqs$vibrant_quality=="complete circular"] + 1
keep_score[input_seqs$vibrant_quality=="high quality draft"] <- keep_score[input_seqs$vibrant_quality=="high quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="medium quality draft"] <- keep_score[input_seqs$vibrant_quality=="medium quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] <- keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] + 0.5
}
if (include_virsorter2) {
keep_score[input_seqs$viral>=50 | input_seqs$lavidaviridae>=0.95 | input_seqs$NCLDV>=0.95] <- keep_score[input_seqs$viral>=50 | input_seqs$lavidaviridae>=0.95 | input_seqs$NCLDV>=0.95] + 0.5
keep_score[input_seqs$viral>=95] <- keep_score[input_seqs$viral>=95] + 0.5
keep_score[input_seqs$RNA>=0.95] <- keep_score[input_seqs$RNA>=0.95] + 1
}
if (include_virsorter) {
keep_score[input_seqs$category==1] <- keep_score[input_seqs$category==1] + 1
keep_score[input_seqs$category==2] <- keep_score[input_seqs$category==2] + 0.5
keep_score[input_seqs$category==4] <- keep_score[input_seqs$category==4] + 1
keep_score[input_seqs$category==5] <- keep_score[input_seqs$category==5] + 0.5
}
if (include_deepvirfinder) {
keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] + 0.5
keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] + 0.5
}
if (include_tuning_viral) {
keep_score[input_seqs$Kaiju_Viral=="Viruses"] <- keep_score[input_seqs$Kaiju_Viral=="Viruses"] + 0.5
keep_score[input_seqs$hallmark>2] <- keep_score[input_seqs$hallmark>2] + 0.5
keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] <- keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] + 0.5
keep_score[input_seqs$percent_viral>=50] <- keep_score[input_seqs$percent_viral>=50] + 0.5
}
if (include_tuning_not_viral) {
keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] <- keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] - 0.5
keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] <- keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] - 1
keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] <- keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] - 1
keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] <- keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] - 1
keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] <- keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] - 1
keep_score[input_seqs$checkv_completeness<=75 & input_seqs$checkv_length<=5000] <- keep_score[input_seqs$checkv_completeness<=75 & input_seqs$checkv_length<=5000] - 0.5
}
return(keep_score)
}
note that this is only as accurate as the annotations of the input sequences
this function calculates the precision, recall, and F1 score for each pipeline
assess_performance <- function(seqtype, keep_score) {
truepositive <- rep("not viral", length(seqtype))
truepositive[seqtype=="virus"] <- "viral"
#make confusion matrix
confusion_matrix <- rep("true negative", length(keep_score))
confusion_matrix[truepositive=="viral" & keep_score<=1] <- "false negative"
confusion_matrix[truepositive=="viral" & keep_score>=1] <- "true positive"
confusion_matrix[truepositive=="not viral" & keep_score>=1] <- "false positive"
TP <- table(confusion_matrix)[4]
FP <- table(confusion_matrix)[2]
TN <- table(confusion_matrix)[3]
FN <- table(confusion_matrix)[1]
precision <- TP/(TP+FP)
recall <- TP/(TP+FN)
F1 <- 2*precision*recall/(precision+recall)
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
auc <- round(auc(truepositive, keep_score),4)
#by type metrics
fungal_FP <- table(confusion_matrix[seqtype=="fungi"])[2]
protist_FP <- table(confusion_matrix[seqtype=="protist"])[2]
bacterial_FP <- table(confusion_matrix[seqtype=="bacteria"])[2]
viral_FN <- table(confusion_matrix[seqtype=="virus"])[1]
performance <- c(precision, recall, F1, MCC, auc, fungal_FP,
protist_FP, bacterial_FP, viral_FN)
names(performance) <- c("precision", "recall", "F1", "MCC", "AUC", "fungal_FP",
"protist_FP", "bacterial_FP", "viral_FN")
return(performance)
}
combination of tools list
combos_list <- data.frame(toolcombo=rep(0, 64),
tune_not_viral=rep(0, 64),
DVF=rep(0, 64),
tune_viral=rep(0, 64),
VIBRANT=rep(0, 64),
VS=rep(0, 64),
VS2=rep(0, 64))
p <- 1
for (i in c(0,1)){
for (j in c(0,1)){
for (k in c(0,1)){
for (l in c(0,1)){
for (m in c(0,1)){
for (n in c(0,1)){
combos_list$toolcombo[p] <- paste(i,j,k,l,m,n)
combos_list$toolcombo2[p] <- paste(if(i){"tv"}else{"0"},if(j){"DVF"}else{"0"},
if(k){"tnv"}else{"0"},if(l){"VB"}else{"0"},
if(m){"VS"}else{"0"},if(n){"VS2"}else{"0"})
combos_list$tune_not_viral[p] <- i
combos_list$DVF[p] <- j
combos_list$tune_viral[p] <- k
combos_list$VIBRANT[p] <- l
combos_list$VS[p] <- m
combos_list$VS2[p] <- n
p <- p+1
}
}
}
}
}
}
combos_list <- combos_list[-1,]
this function builds a list of all of the combinations that the user wants to test. In this case, we’re comparing the performance of all unique combinations of the six tools.
build_score_list <- function(input_seqs, combos) {
output <- data.frame(precision=rep(0, nrow(combos)),
recall=rep(0, nrow(combos)),
F1=rep(0, nrow(combos)),
MCC=rep(0, nrow(combos)),
AUC=rep(0, nrow(combos)),
fungal_FP=rep(0, nrow(combos)),
protist_FP=rep(0, nrow(combos)),
bacterial_FP=rep(0, nrow(combos)),
viral_FN=rep(0, nrow(combos)))
for (i in 1:nrow(combos)) {
keep_score <- getting_viral_set_1(input_seqs, include_vibrant = combos$VIBRANT[i],
include_virsorter = combos$VS[i],
include_virsorter2 = combos$VS2[i],
include_tuning_viral = combos$tune_viral[i],
include_tuning_not_viral = combos$tune_not_viral[i],
include_deepvirfinder = combos$DVF[i])
output[i,1:9] <- assess_performance(input_seqs$seqtype, keep_score)
output$toolcombo[i] <- paste(combos$tune_viral[i],combos$DVF[i],
combos$tune_not_viral[i], combos$VIBRANT[i],
combos$VS[i], combos$VS2[i])
}
output[is.na(output)] <- 0
return (output)
}
accuracy_scores <- data.frame(testing_set_index=rep(0, nrow(combos_list)*10),
precision=rep(0, nrow(combos_list)*10),
recall=rep(0, nrow(combos_list)*10),
F1=rep(0, nrow(combos_list)*10),
MCC=rep(0, nrow(combos_list)*10),
AUC=rep(0, nrow(combos_list)*10),
fungal_FP=rep(0, nrow(combos_list)*10),
protist_FP=rep(0, nrow(combos_list)*10),
bacterial_FP=rep(0, nrow(combos_list)*10),
viral_FN=rep(0, nrow(combos_list)*10))
accuracy_scores <- cbind(testing_set_index=rep(1, nrow(combos_list)),
build_score_list(viruses[viruses$Index==1,], combos_list))
for (i in 2:10) {
accuracy_scores <- rbind(accuracy_scores,
cbind(testing_set_index=rep(i, nrow(combos_list)),
build_score_list(viruses[viruses$Index==i,], combos_list)))
}
accuracy_scores$numrules <- str_count(accuracy_scores$toolcombo, "1")
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numrules, decreasing=F),]
accuracy_scores <- accuracy_scores[order(accuracy_scores$MCC, decreasing=F),]
accuracy_scores$toolcombo <- factor(accuracy_scores$toolcombo, levels = unique(accuracy_scores$toolcombo))
accuracy_scores$numrules <- as.factor(accuracy_scores$numrules)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
p2 <- ggplot(accuracy_scores, aes(x=toolcombo, y=F1,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("F1 Score")
p2
ggplot(accuracy_scores, aes(x=toolcombo, y=precision,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision")
ggplot(accuracy_scores, aes(x=toolcombo, y=recall,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=precision, y=recall,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Precision") +
ylab("Recall")
ggplot(accuracy_scores[accuracy_scores$testing_set_index==1,], aes(x=precision, y=recall)) +
geom_label(alpha=0.7, label=accuracy_scores$toolcombo[accuracy_scores$testing_set_index==1]) +
geom_point(alpha=0.5, aes(color=numrules, fill=numrules)) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Precision") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=abs(precision-recall),
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision-Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=MCC,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("MCC")
ggplot(accuracy_scores, aes(x=toolcombo, y=AUC,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("AUC")
ggplot(accuracy_scores, aes(x=toolcombo, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Fungal False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=protist_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Protist False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=bacterial_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Bacterial False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Viral False Negatives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Viral False Negatives")
ggplot(accuracy_scores, aes(x=protist_FP, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Protist FP") +
ylab("Fungal FP")
ggplot(accuracy_scores, aes(x=recall, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Recall") +
ylab("Fungal FP")
ggplot(accuracy_scores, aes(x=recall, y=protist_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Recall") +
ylab("Protist FP")
accuracy_scores_melt <- accuracy_scores %>%
select(testing_set_index, precision, recall, MCC, numrules, toolcombo) %>%
pivot_longer(cols=c(precision, recall, MCC),
names_to="performance_metric",
values_to="performance_metric_score")
ggplot(accuracy_scores_melt, aes(x=numrules, y=performance_metric_score,
There were 25 warnings (use warnings() to see them)
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Rule Sets") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
comparing metric with and without tuning rules
accuracy_scores_melt$tuning_inc <- "no"
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 1, 1)==1] <- "tv"
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 3, 3)==1] <- "tnv"
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 1, 1)==1 &
substring(accuracy_scores_melt$toolcombo, 3, 3)==1] <- "both"
ggplot(accuracy_scores_melt, aes(x=tuning_inc, y=performance_metric_score)) +
There were 50 or more warnings (use warnings() to see the first 50)
geom_boxplot() +
geom_point(aes(color=numrules, fill=numrules), alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Tools") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
ggplot(accuracy_scores_melt, aes(x=tuning_inc, y=performance_metric_score,
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Tools") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
ggplot(accuracy_scores_melt, aes(x=tuning_inc, y=performance_metric_score)) +
geom_boxplot() +
geom_boxplot(aes(color=numrules, fill=numrules)) +
# geom_point(aes(color=numrules, fill=numrules), alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
NA
write_tsv(accuracy_scores, "20221029_accuracy_scores.tsv")
There were 50 or more warnings (use warnings() to see the first 50)
to do: add in clustering and ordination like in the drinking water R notebook
viruses$keep_score_high_precision <- getting_viral_set_1(viruses, include_deepvirfinder = T,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = T,
include_virsorter = F)
viruses$confusion_matrix_high_precision <- "true negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision<1] <- "false negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision>=1] <- "true positive"
viruses$confusion_matrix_high_precision[viruses$seqtype!="virus" & viruses$keep_score_high_precision>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
length(grep("true", viruses$confusion_matrix_high_precision))/nrow(viruses)
[1] 0.9206364
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
this rule set had the highest precision, but as you can see, this comes with a big sacrifice in recall
viruses$keep_score_high_MCC <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_high_MCC <- "true negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC<1] <- "false negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC>=1] <- "true positive"
viruses$confusion_matrix_high_MCC[viruses$seqtype!="virus" & viruses$keep_score_high_MCC>=1] <- "false positive"
accuracy:
length(grep("true", viruses$confusion_matrix_high_MCC))/nrow(viruses)
[1] 0.9450831
recall
length(grep("true positive", viruses$confusion_matrix_high_MCC))/length(grep("virus", viruses$seqtype))
[1] 0.8394
There were 22 warnings (use warnings() to see them)
TP <- table(viruses$confusion_matrix_high_MCC)[4]
FP <- table(viruses$confusion_matrix_high_MCC)[2]
TN <- table(viruses$confusion_matrix_high_MCC)[3]
FN <- table(viruses$confusion_matrix_high_MCC)[1]
precision <- as.numeric(TP/(TP+FP))
precision
[1] 0.6950977
recall <- as.numeric(TP/(TP+FN))
recall
[1] 0.8394
F1 <- as.numeric(2*precision*recall/(precision+recall))
F1
[1] 0.7604639
MCC <- as.numeric((TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN)))
MCC
[1] 0.7339017
precision=69%, recall=87%, MCC=77%
precision adjusting size to be equal viral/not viral
TP <- table(viruses$confusion_matrix_high_MCC)[4]
There were 50 or more warnings (use warnings() to see the first 50)
FP <- table(viruses$confusion_matrix_high_MCC)[2]*.11
TN <- table(viruses$confusion_matrix_high_MCC)[3]*.11
FN <- table(viruses$confusion_matrix_high_MCC)[1]
precision <- as.numeric(TP/(TP+FP))
precision
[1] 0.9539699
recall <- as.numeric(TP/(TP+FN))
recall
[1] 0.8394
F1 <- as.numeric(2*precision*recall/(precision+recall))
F1
[1] 0.8930253
MCC <- as.numeric((TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN)))
MCC
[1] 0.8002465
precision=0.95, recall=0.87, F1=0.91, MCC=0.82
visualizing confusion matrix by taxa
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, Index)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
differences based on genome size
viruses$size_class <- "3-5kb"
There were 50 or more warnings (use warnings() to see the first 50)
viruses$size_class[viruses$checkv_length>5000] <- "5-10kb"
viruses$size_class[viruses$checkv_length>10000] <- ">10kb"
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, size_class, Index)
There were 50 or more warnings (use warnings() to see the first 50)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","size", "index", "count")
confusion_vir_called <- confusion_by_taxa %>% filter(confusion_matrix=="true positive" | confusion_matrix=="false positive")
type_count <- viruses %>% count(seqtype, size_class, Index)
confusion_vir_called$per_viral <- 0
for (i in c(1:nrow(confusion_vir_called))) {
confusion_vir_called$per_viral[i] <- confusion_vir_called$count[i]/type_count$n[type_count$seqtype==confusion_vir_called$seqtype[i] &
type_count$Index==confusion_vir_called$index[i] &
type_count$size_class==confusion_vir_called$size[i]]*100
}
confusion_vir_called <- confusion_vir_called %>% group_by(seqtype, size) %>%
summarise(mean=mean(per_viral),
sd=sd(per_viral))
`summarise()` has grouped output by 'seqtype'. You can override using the `.groups` argument.
confusion_vir_called$size <- factor(confusion_vir_called$size,
levels = c("3-5kb", "5-10kb", ">10kb"))
ggplot(confusion_vir_called, aes(y=mean, x=size,
There were 20 warnings (use warnings() to see them)
fill=seqtype,
color=seqtype)) +
geom_bar(stat="identity", position=position_dodge()) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2,
position=position_dodge(.9)) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Length") +
ylab("Sequences Called Viral (%)")
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2 <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2_vs <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv_tnv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses_high <- viruses[viruses$keep_score_vb_dvf_vs2_vs_tv>=1,]
There were 20 warnings (use warnings() to see them)
viruses_high_mod <- viruses_high %>% select(keep_score_vb,keep_score_vb_dvf,
keep_score_vb_dvf_vs2, keep_score_vb_dvf_vs2_vs,
keep_score_vb_dvf_vs2_vs_tv, keep_score_vb_dvf_vs2_vs_tv_tnv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
No id variables; using all as measure variables
colnames(sm_m) <- c("method", "viral_score")
sm_m <- sm_m[sm_m$viral_score>0,]
sm_m$score <- sm_m$viral_score
sm_m$score[sm_m$viral_score==0.5] <- "0.5"
sm_m$score[sm_m$viral_score>=1] <- "1"
sm_m$score[sm_m$viral_score>=2] <- "2"
sm_m$score[sm_m$viral_score>=3] <- "3"
sm_m$score[sm_m$viral_score>=4] <- "4"
sm_m$score[sm_m$viral_score>=5] <- "5"
sm_m$score <- factor(sm_m$score,
levels=c("0.5", "1", "2","3","4","5"))
ggplot(sm_m, aes(x=method, y=score,
fill=score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
xlab("") +
ylab("Number of Sequences") +
coord_flip()
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
viruses_mcc_alluvial <- data.frame(seqtype=viruses$seqtype,
There were 50 or more warnings (use warnings() to see the first 50)
keep_score_high_MCC=viruses$keep_score_high_MCC,
confusion_matrix_high_MCC=viruses$confusion_matrix_high_MCC)
viruses_mcc_alluvial$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_dvf <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_vs <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = T)
viruses_mcc_alluvial$keep_score_vs2 <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_tv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = F,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_tnv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = T,
include_virsorter = F)
viruses_mcc_alluvial %>%
count(seqtype, keep_score_high_MCC) %>%
spread(key = keep_score_high_MCC, value=n)
viruses_mcc_alluvial <- viruses_mcc_alluvial %>%
There were 28 warnings (use warnings() to see them)
count(seqtype, keep_score_dvf, keep_score_vb, keep_score_vs,
keep_score_vs2, keep_score_tv, keep_score_tnv, keep_score_high_MCC) %>%
mutate(high_mcc_viral_score=factor(round(keep_score_high_MCC)))
ggplot(viruses_mcc_alluvial,
There were 50 or more warnings (use warnings() to see the first 50)
aes(axis1 = keep_score_dvf, axis2 = keep_score_vb,
axis3 = keep_score_vs, axis4 = keep_score_vs2,
axis5 = keep_score_tv, axis6 = keep_score_tnv,
y=n)) +
geom_alluvium(aes(fill=high_mcc_viral_score),
width = 0, knot.pos = 0, reverse = FALSE) +
geom_stratum(width = 1/5) +
theme_bw() +
geom_text(stat = "stratum", aes(label = after_stat(stratum)),
reverse = FALSE) +
theme(
axis.text.x=element_text(size=14, angle = 90)
) +
scale_x_continuous(breaks=c(1,2,3,4,5,6),
labels=c("dvf", "kj", "vs", "vs2",
"tv", "tnv")) +
scale_fill_brewer(palette = "PuOr", ) +
facet_wrap(~seqtype, scales="free_y")
viruses$keep_score_visualize <- viruses$keep_score_high_MCC
viruses$keep_score_visualize[viruses$keep_score_high_MCC>1] <- "> 1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==1] <- "1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0.5] <- "0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0] <- "0"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-0.5] <- "-0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-1] <- "-1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC<=-1] <- "< -1"
viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
levels=c("< -1", "-1", "-0.5", "0", "0.5","1", "> 1"))
#viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
# labels=c("≤ 0", "≤ 0", "≤ 0", "0.5","1", "> 1"))
levels(factor(viruses$keep_score_visualize))
ggplot(viruses, aes(x=as.factor(Index),
fill=keep_score_visualize, color=keep_score_visualize)) +
geom_bar(stat="count", position="stack") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16)
) +
scale_color_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 0.5)) +
xlab("Index") +
ylab("Sequence Count") +
facet_wrap(~confusion_matrix_high_MCC, scales = "free")
viral_scores <- matrix(data=0, nrow=nrow(viruses), ncol=nrow(combos_list))
There were 50 or more warnings (use warnings() to see the first 50)
num_viruses <- data.frame(toolcombo=rep(0, nrow(combos_list)),
num_viruses=rep(0, nrow(combos_list)))
for (i in 1:nrow(combos_list)) {
viral_scores[,i] <- getting_viral_set_1(viruses, include_vibrant = combos_list$VIBRANT[i],
include_virsorter = combos_list$VS[i],
include_virsorter2 = combos_list$VS2[i],
include_tuning_viral = combos_list$tune_viral[i],
include_tuning_not_viral = combos_list$tune_not_viral[i],
include_deepvirfinder = combos_list$DVF[i])
if (max(viral_scores[,i])<=0) {
num_viruses$num_viruses[i] <- 0
}
else {
num_viruses$num_viruses[i] <- table(viral_scores[,i]>=1)[[2]]
}
num_viruses$toolcombo[i] <- combos_list$toolcombo[i]
num_viruses$toolcombo2[i] <- combos_list$toolcombo2[i]
}
num_viruses$numtools <- str_count(num_viruses$toolcombo, "1")
num_viruses <- num_viruses[order(num_viruses$num_viruses, decreasing=F),]
num_viruses$toolcombo <- factor(num_viruses$toolcombo, levels = unique(num_viruses$toolcombo))
num_viruses$toolcombo2 <- factor(num_viruses$toolcombo2, levels = unique(num_viruses$toolcombo2))
num_viruses$numtools <- as.factor(num_viruses$numtools)
viral_scores_nozeros <- viral_scores[rowSums(viral_scores)>0,]
There were 23 warnings (use warnings() to see them)
viral_scores_nozeros <- viral_scores_nozeros + 1
viral_scores_nozeros <- as.data.frame(viral_scores_nozeros)
colnames(viral_scores_nozeros) <- num_viruses$toolcombo
library(phyloseq)
tooldata <- num_viruses
There were 23 warnings (use warnings() to see them)
rownames(tooldata) <- tooldata$toolcombo
physeq_pooled <- phyloseq(otu_table(viral_scores_nozeros, taxa_are_rows = T),
sample_data(tooldata))
Error in validObject(.Object) : invalid class “phyloseq” object:
Component sample names do not match.
Try sample_names()
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
results may be meaningless because data have negative entries in method “bray”
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=tooldata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
myclusters <- cutree(clusters, h=0.5)
There were 50 or more warnings (use warnings() to see the first 50)
#names(myclusters[myclusters==1])
#names(myclusters[myclusters==2])
#names(myclusters[myclusters==3])
#names(myclusters[myclusters==4])
#names(myclusters[myclusters==5])
myclusters_df <- tibble(combo=names(myclusters),
cluster_index=myclusters)
myclusters_df <- separate(myclusters_df, col=combo, into=c("tnv", "DVF",
"tv", "VB",
"VS", "VS2"),
sep=" ", remove = F)
tool_count <- as.data.frame(rbind(table(myclusters_df$tnv, myclusters_df$cluster_index)[2,],
table(myclusters_df$DVF, myclusters_df$cluster_index)[2,],
table(myclusters_df$tv, myclusters_df$cluster_index)[2,],
table(myclusters_df$VB, myclusters_df$cluster_index)[2,],
table(myclusters_df$VS, myclusters_df$cluster_index)[2,],
table(myclusters_df$VS2, myclusters_df$cluster_index)[2,])
)
tool_count <- data.frame(t(apply(tool_count, c(1), function(x) {x <- x/table(myclusters_df$cluster_index)})))
tool_count$method <- c("tnv", "DVF", "tv", "VB", "VS", "VS2")
tool_count <- melt(tool_count)
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(tool_count). In the next version, this warning will become an error.Using method as id variables
colnames(tool_count) <- c("tool", "cluster_index", "tool_count_norm")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
There were 23 warnings (use warnings() to see them)
ggplot(tool_count, aes(x=tool, y=tool_count_norm,
fill=tool,
color=tool)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
#legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Tool") +
ylab("Proportion of Times in Cluster") +
facet_wrap(~cluster_index, nrow=1)
accuracy_scores_melt <- accuracy_scores %>%
There were 46 warnings (use warnings() to see them)
select(precision, recall, MCC, numtools, toolcombo) %>%
group_by(numtools, toolcombo) %>%
summarise(precision=mean(precision),
recall=mean(recall),
MCC=mean(MCC)) %>%
pivot_longer(cols=c(precision, recall, MCC),
names_to="performance_metric",
values_to="performance_metric_score")
`summarise()` has grouped output by 'numtools'. You can override using the `.groups` argument.
myclusters_df <- inner_join(accuracy_scores_melt, myclusters_df,
by=c("toolcombo"="combo"))
myclusters_df$cluster_index <- as.factor(myclusters_df$cluster_index)
ggplot(myclusters_df, aes(x=cluster_index, y=performance_metric_score,
There were 46 warnings (use warnings() to see them)
color=cluster_index, fill=cluster_index)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Cluster") +
scale_fill_manual(name="",
values = alpha(rev(pal(9)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(9)), 1)) +
facet_wrap(~performance_metric)
viruses$keep_score_all <- getting_viral_set_1(viruses, include_deepvirfinder = T,
Warning messages:
1: In readChar(file, size, TRUE) : truncating string with embedded nuls
2: In readChar(file, size, TRUE) : truncating string with embedded nuls
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_all <- "true negative"
There were 19 warnings (use warnings() to see them)
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all<1] <- "false negative"
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all>=1] <- "true positive"
viruses$confusion_matrix_all[viruses$seqtype!="virus" & viruses$keep_score_all>=1] <- "false positive"
TP <- table(viruses$confusion_matrix_all)[4]
There were 32 warnings (use warnings() to see them)
FP <- table(viruses$confusion_matrix_all)[2]
TN <- table(viruses$confusion_matrix_all)[3]
FN <- table(viruses$confusion_matrix_all)[1]
precision <- TP/(TP+FP)
precision
true positive
0.6258661
recall <- TP/(TP+FN)
recall
true positive
0.9214
F1 <- 2*precision*recall/(precision+recall)
F1
true positive
0.7454089
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
MCC
true positive
0.7269526
precision=62%, recall=92%, MCC=73%
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_all, viruses$seqtype, viruses$Index))
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
table(viruses$confusion_matrix_all)
length(grep("true", viruses$confusion_matrix_all))/nrow(viruses)
length(grep("true positive", viruses$confusion_matrix_all))/length(grep("virus", viruses$seqtype))
[1] 0.9214
There were 16 warnings (use warnings() to see them)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses$keep_score_high_recall <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$confusion_matrix_high_recall <- "true negative"
There were 26 warnings (use warnings() to see them)
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall<1] <- "false negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall>=1] <- "true positive"
viruses$confusion_matrix_high_recall[viruses$seqtype!="virus" & viruses$keep_score_high_recall>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index))
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
p2 <- ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
p2
accuracy:
length(grep("true", viruses$confusion_matrix_high_recall))/nrow(viruses)
0.887
recall
length(grep("true positive", viruses$confusion_matrix_high_recall))/length(grep("virus", viruses$seqtype))
recover almost all of the viruses this way, but more protist contamination
0.960
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_recall, seqtype, size_class)
There were 34 warnings (use warnings() to see them)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","size", "count")
viruses$keep_score_few_tools <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = F,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = F)
viruses$confusion_matrix_few_tools <- "true negative"
There were 19 warnings (use warnings() to see them)
viruses$confusion_matrix_few_tools[viruses$seqtype=="virus" & viruses$keep_score_few_tools<1] <- "false negative"
viruses$confusion_matrix_few_tools[viruses$seqtype=="virus" & viruses$keep_score_few_tools>=1] <- "true positive"
viruses$confusion_matrix_few_tools[viruses$seqtype!="virus" & viruses$keep_score_few_tools>=1] <- "false positive"
TP <- table(viruses$confusion_matrix_few_tools)[4]
FP <- table(viruses$confusion_matrix_few_tools)[2]
TN <- table(viruses$confusion_matrix_few_tools)[3]
FN <- table(viruses$confusion_matrix_few_tools)[1]
precision <- TP/(TP+FP)
precision
true positive
0.7729702
recall <- TP/(TP+FN)
recall
true positive
0.7664
F1 <- 2*precision*recall/(precision+recall)
F1
true positive
0.7696711
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
MCC
true positive
0.7431145
precision=77%, recall=76%, MCC=74%
Extra Stuff #####################################################################
ggplot(viruses, aes(x=checkv_length, y=keep_score_high_MCC,
fill=confusion_matrix_high_MCC,
color=confusion_matrix_high_MCC)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=hallmark,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Number of Hallmark Genes") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=keep_score_high_MCC,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=confusion_matrix_high_recall, y=checkv_length,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
scale_y_log10()
looking at false negatives
viruses_false_negs <- viruses[(viruses$seqtype=="virus" & viruses$keep_score_high_recall<1),]
looking at protists calling viral
viruses_false_pos_protists <- viruses[(viruses$seqtype=="protist" & viruses$keep_score_high_recall>=1),]
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_tv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_high <- viruses[viruses$keep_score_vb_tv>=1,] #uncomment this line if want to use all 6 tools
viruses_high_mod <- viruses_high %>% select(keep_score_vb,
keep_score_vb_tv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
colnames(sm_m) <- c("method", "score")
ggplot(sm_m, aes(x=method, y=score,
fill=as.factor(score))) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Number of Methods',
values = alpha(c(viridis(14)), 1)) +
xlab("Primary Method") +
ylab("Count of Viral Contigs") +
coord_flip()
library(pROC)
viruses$truepositive <- rep(0, nrow(viruses))
viruses$truepositive[viruses$seqtype=="virus"] <- 1
rocobj <- roc(viruses$truepositive, viruses$keep_score)
rocobj_all <- roc(viruses$truepositive, viruses$keep_score_all)
auc <- round(auc(viruses$truepositive, viruses$keep_score),4)
auc_all <- round(auc(viruses$truepositive, viruses$keep_score_all),4)
#create ROC plot
ggroc(rocobj, colour = 'steelblue', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
coord_equal()
ggroc(rocobj_all, colour = 'green', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc_all, ')'))
Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive. Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.
viral_scores <- matrix(data=0, nrow=nrow(viruses), ncol=nrow(combos_list))
num_viruses <- data.frame(toolcombo=rep(0, nrow(combos_list)),
num_viruses=rep(0, nrow(combos_list)))
for (i in 1:nrow(combos_list)) {
viral_scores[,i] <- getting_viral_set_1(viruses, include_vibrant = combos_list$VIBRANT[i],
include_virsorter = combos_list$VS[i],
include_virsorter2 = combos_list$VS2[i],
include_tuning = combos_list$CheckV[i],
include_kaiju = combos_list$Kaiju[i],
include_deepvirfinder = combos_list$DVF[i])
num_viruses$num_viruses[i] <- table(viral_scores[,i]>=1)[[2]]
num_viruses$toolcombo[i] <- combos_list$toolcombo[i]
num_viruses$toolcombo2[i] <- combos_list$toolcombo2[i]
}
num_viruses$numrules <- str_count(num_viruses$toolcombo, "1")
num_viruses <- num_viruses[order(num_viruses$num_viruses, decreasing=F),]
num_viruses$toolcombo <- factor(num_viruses$toolcombo, levels = unique(num_viruses$toolcombo))
num_viruses$toolcombo2 <- factor(num_viruses$toolcombo2, levels = unique(num_viruses$toolcombo2))
num_viruses$numrules <- as.factor(num_viruses$numrules)
ggplot(num_viruses, aes(x=toolcombo, y=num_viruses,
color=numrules, fill=numrules)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=toolcombo2, y=num_viruses,
color=numrules, fill=numrules)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=numrules, y=num_viruses)) +
geom_boxplot(aes(color=numrules)) +
geom_point(aes(color=numrules, fill=numrules)) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Tools") +
ylab("Num Viruses Predicted")
viral_scores_nozeros <- viral_scores[rowSums(viral_scores)>0,]
viral_scores_nozeros <- viral_scores_nozeros + 1
viral_scores_nozeros <- as.data.frame(viral_scores_nozeros)
colnames(viral_scores_nozeros) <- num_viruses$toolcombo2
library(phyloseq)
tooldata <- num_viruses
rownames(tooldata) <- tooldata$toolcombo2
physeq_pooled <- phyloseq(otu_table(viral_scores_nozeros, taxa_are_rows = T),
sample_data(tooldata))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=tooldata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
to do: try coloring above based on the F1 scores of the testing set on each combination
bray_dist <- phyloseq::distance(physeq_pooled, method="bray")
clusters <- hclust(dist(bray_dist))
plot(clusters)
myclusters <- cutree(clusters, h=1.1)
names(myclusters[myclusters==1])
names(myclusters[myclusters==2])
names(myclusters[myclusters==3])
names(myclusters[myclusters==4])
names(myclusters[myclusters==5])
myclusters_df <- tibble(combo=names(myclusters),
cluster_index=myclusters)
myclusters_df <- separate(myclusters_df, col=combo, into=c("CheckV", "DVF",
"Kaiju", "VIBRANT",
"VirSorter", "VirSorter2"),
sep=" ", remove = F)
tool_count <- as.data.frame(rbind(table(myclusters_df$CheckV, myclusters_df$cluster_index)[2,],
table(myclusters_df$DVF, myclusters_df$cluster_index)[2,],
table(myclusters_df$Kaiju, myclusters_df$cluster_index)[2,],
table(myclusters_df$VIBRANT, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter2, myclusters_df$cluster_index)[2,])
)
tool_count$method <- c("CheckV", "DVF", "Kaiju", "VIBRANT", "VirSorter", "VirSorter2")
tool_count <- melt(tool_count)
colnames(tool_count) <- c("tool", "cluster_index", "tool_count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(tool_count, aes(x=cluster_index, y=tool_count,
fill=cluster_index,
color=cluster_index)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Cluster") +
ylab("Number of Times in Cluster") +
facet_wrap(~tool, scales = "free")
ggplot(viruses, aes(x=checkv_viral_genes, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Viral Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=percent_viral, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Percent Genes Viral") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses_false_positive <- viruses[viruses$confusion_matrix_high_precision=="false positive",]
viruses_false_negative <- viruses[viruses$confusion_matrix_high_precision=="false negative",]
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=checkv_length,
color=checkv_length,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="bacteria"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="fungi"], aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="protist"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
table(viruses$hallmark[viruses$confusion_matrix_high_precision=="false positive"]>0)
table(viruses$percent_host[viruses$confusion_matrix_high_precision=="false positive"]<50)